Purpose of project and Dataset details

This mini-project explores summary data from a clinical trial on Dapagliflozin (NCT01400884) in type 2 diabetes. It includes data cleaning, statistical summaries, AE profiling, survival analysis, and interactive visualizations, mimicking real-world workflows in pharmacovigilance and clinical reporting.

Tools and Packages used

Data Handling: tidyverse, janitor, here

Reporting: gtsummary, gt, Tplyr

Visualization: ggplot2, patchwork, plotly, survminer

Interactivity: DT, shiny

Clinical Packages: admiral, survival

Phase 1: Data Cleaning and Structuring

In this phase, we load and preprocess the clinical trial data. The focus is on standardizing variable names, correcting data types, and formatting the datasets for downstream analysis.

# Load libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(here)
## Warning: package 'here' was built under R version 4.4.3
## here() starts at C:/Users/simranpreet/OneDrive - Nottingham Trent University
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.4.3
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.4.3
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.4.3
library(gt)
## Warning: package 'gt' was built under R version 4.4.3
library(flextable)
## Warning: package 'flextable' was built under R version 4.4.3
## 
## Attaching package: 'flextable'
## 
## The following objects are masked from 'package:ggpubr':
## 
##     border, font, rotate
## 
## The following object is masked from 'package:gtsummary':
## 
##     continuous_summary
## 
## The following object is masked from 'package:purrr':
## 
##     compose
library(shiny)
## Warning: package 'shiny' was built under R version 4.4.3
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
## 
## Attaching package: 'DT'
## 
## The following objects are masked from 'package:shiny':
## 
##     dataTableOutput, renderDataTable
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## 
## The following objects are masked from 'package:flextable':
## 
##     highlight, style
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(cardx)
## Warning: package 'cardx' was built under R version 4.4.3
library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 4.4.3
library(officer)
## Warning: package 'officer' was built under R version 4.4.3
library(webshot)
## Warning: package 'webshot' was built under R version 4.4.3
library(tinytex)
## Warning: package 'tinytex' was built under R version 4.4.3
library(survival)
## Warning: package 'survival' was built under R version 4.4.3
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.3
## 
## Attaching package: 'survminer'
## 
## The following object is masked from 'package:survival':
## 
##     myeloma
library(admiral)
## Warning: package 'admiral' was built under R version 4.4.3
library(Tplyr)
## Warning: package 'Tplyr' was built under R version 4.4.3
library(remotes)
## Warning: package 'remotes' was built under R version 4.4.3
library(bslib)
## Warning: package 'bslib' was built under R version 4.4.3
## 
## Attaching package: 'bslib'
## 
## The following object is masked from 'package:utils':
## 
##     page
# Load and clean datasets
# The data folder is located relative to the here() project root instead of
# through a user-specific absolute path, so the script is not tied to one
# machine. Assumes here() resolves to the folder that contains
# clinical_trial_NCT01400884/ (as reported when here was attached).
data_dir <- here(
  "clinical_trial_NCT01400884",
  "clinical-trial-summary-real",
  "data"
)

# Column names are normalised with clean_names() right after reading; the
# later cleaning steps refer to the normalised names.
baseline <- read_csv(
  file.path(data_dir, "baseline.csv"),
  show_col_types = FALSE
) %>%
  clean_names()
ae_summary <- read_csv(
  file.path(data_dir, "ae_summary.csv"),
  show_col_types = FALSE
) %>%
  clean_names()

# Clean baseline data
# Keep the analysis columns only, then fix their types: the arm becomes a
# factor, both sex percentages are coerced to numeric, and mean BMI is rounded
# to one decimal place.
baseline_clean <- baseline %>%
  select(
    treatment_arm, n, age_mean, age_sd,
    sex_male_percent, sex_female_percent, bmi_mean
  ) %>%
  mutate(
    treatment_arm = factor(treatment_arm),
    across(c(sex_male_percent, sex_female_percent), as.numeric),
    bmi_mean = round(bmi_mean, digits = 1)
  )

# Clean AE data
# Select the reporting columns first (renaming percent_percent to
# percent_events on the way), then standardise them: arm as a factor,
# title-cased category, severity as a factor with levels Mild, Moderate,
# Severe, and the percentage rounded to one decimal place.
ae_clean <- ae_summary %>%
  select(
    treatment_arm,
    ae_category,
    ae_severity,
    n_events,
    percent_events = percent_percent
  ) %>%
  mutate(
    treatment_arm = factor(treatment_arm),
    ae_category = str_to_title(ae_category),
    ae_severity = factor(ae_severity, levels = c("Mild", "Moderate", "Severe")),
    percent_events = round(percent_events, digits = 1)
  )

# Save cleaned files
# Both cleaned tables are written to outputs/cleaned under the here() root;
# the folder (and any missing parents) is created on first use.
cleaned_dir <- here("outputs", "cleaned")
if (!dir.exists(cleaned_dir)) {
  dir.create(cleaned_dir, recursive = TRUE)
}

write_csv(baseline_clean, file.path(cleaned_dir, "baseline_clean.csv"))
write_csv(ae_clean, file.path(cleaned_dir, "ae_summary_clean.csv"))

Phase 2: Statistical Summary and Visualizations

This phase transforms cleaned data into meaningful statistical tables and plots to summarize baseline characteristics, adverse events, and treatment outcomes.

Objectives:

- Compare demographics (age, BMI) across treatment arms.
- Explore adverse event distributions by severity.
- Generate Kaplan-Meier survival curves.

# Load cleaned data
# Re-read the two cleaned CSV files from outputs/cleaned so this phase starts
# from the saved files rather than from the in-memory objects.
cleaned_dir <- here("outputs", "cleaned")
baseline_clean <- read_csv(
  file.path(cleaned_dir, "baseline_clean.csv"),
  show_col_types = FALSE
)
ae_clean <- read_csv(
  file.path(cleaned_dir, "ae_summary_clean.csv"),
  show_col_types = FALSE
)

# Summary statistics with gtsummary
# Builds a by-arm summary table, adds p-values, relabels the first column as
# "Variable" and bolds the variable labels.
# NOTE(review): baseline_clean holds a single summary row per arm (the rendered
# table reports N = 1 for every arm), so each value is tabulated as its own
# category and the Fisher p-values carry no information. Confirm whether a
# plain display table of the arm-level values is what is wanted here.
baseline_tbl <- tbl_summary(
  baseline_clean,
  by = treatment_arm,
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{n} ({p}%)"
  )
)
baseline_tbl <- add_p(baseline_tbl)
baseline_tbl <- modify_header(baseline_tbl, label ~ "**Variable**")
bold_labels(baseline_tbl)
| Variable | Dapagliflozin 10mg, N = 1 (1) | Dapagliflozin 5mg, N = 1 (1) | Placebo, N = 1 (1) | p-value (2) |
|---|---|---|---|---|
| n | | | | >0.9 |
|     97 | 1 (100%) | 0 (0%) | 0 (0%) | |
|     98 | 0 (0%) | 1 (100%) | 0 (0%) | |
|     100 | 0 (0%) | 0 (0%) | 1 (100%) | |
| age_mean | | | | >0.9 |
|     57.6 | 1 (100%) | 0 (0%) | 0 (0%) | |
|     58.2 | 0 (0%) | 0 (0%) | 1 (100%) | |
|     59.1 | 0 (0%) | 1 (100%) | 0 (0%) | |
| age_sd | | | | >0.9 |
|     8.7 | 0 (0%) | 1 (100%) | 0 (0%) | |
|     9.1 | 0 (0%) | 0 (0%) | 1 (100%) | |
|     9.3 | 1 (100%) | 0 (0%) | 0 (0%) | |
| sex_male_percent | | | | >0.9 |
|     55 | 0 (0%) | 0 (0%) | 1 (100%) | |
|     58 | 1 (100%) | 0 (0%) | 0 (0%) | |
|     60 | 0 (0%) | 1 (100%) | 0 (0%) | |
| sex_female_percent | | | | >0.9 |
|     40 | 0 (0%) | 1 (100%) | 0 (0%) | |
|     42 | 1 (100%) | 0 (0%) | 0 (0%) | |
|     45 | 0 (0%) | 0 (0%) | 1 (100%) | |
| bmi_mean | | | | >0.9 |
|     30.9 | 1 (100%) | 0 (0%) | 0 (0%) | |
|     31.5 | 0 (0%) | 0 (0%) | 1 (100%) | |
|     32.1 | 0 (0%) | 1 (100%) | 0 (0%) | |

(1) n (%)
(2) Fisher’s exact test
# Simulate survival data (replace with real data if available)
# Two arms ("A" and "B") of 50 rows each; times are exponential draws with
# rate 0.1 rounded to one decimal, and the event indicator is drawn from 0/1.
set.seed(100)
n_per_arm <- 50
n_total <- 2 * n_per_arm
survival_data <- tibble(
  treatment_arm = rep(c("A", "B"), each = n_per_arm),
  time = round(rexp(n_total, rate = 0.1), 1),
  status = sample(0:1, n_total, replace = TRUE)
)

# Kaplan-Meier plot
# Fit one survival curve per arm, then draw the curves with a p-value,
# confidence bands and a risk table underneath.
km_fit <- survfit(
  Surv(time, status) ~ treatment_arm,
  data = survival_data
)

km_plot <- ggsurvplot(
  fit = km_fit,
  data = survival_data,
  title = "Kaplan-Meier Survival by Treatment Arm",
  legend.title = "Treatment",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  conf.int = TRUE,
  pval = TRUE,
  risk.table = TRUE
)
km_plot

# Chi-square test
# Build an arm x AE-category contingency table. count() is called without a
# weight, so each cell is the number of rows in ae_clean for that pair, not
# the n_events total.
# NOTE(review): the printed table below has 1 in every cell, which is why the
# test returns X-squared = 0 and p-value = 1. If event totals are what should
# be compared, count(..., wt = n_events) may be intended -- confirm.
chisq_table <- ae_clean %>%
  count(treatment_arm, ae_category) %>%
  pivot_wider(names_from = ae_category, values_from = n, values_fill = 0)

# Drop the first column (treatment_arm) so only the count columns are tested.
chisq.test(chisq_table[,-1])
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  chisq_table[, -1]
## X-squared = 0, df = 2, p-value = 1
# Print the contingency table that was tested.
chisq_table
## # A tibble: 3 × 3
##   treatment_arm      Headache Nausea
##   <chr>                 <int>  <int>
## 1 Dapagliflozin 10mg        1      1
## 2 Dapagliflozin 5mg         1      1
## 3 Placebo                   1      1
# Fisher's exact test on the same count columns.
fisher.test(chisq_table[,-1])
## 
##  Fisher's Exact Test for Count Data
## 
## data:  chisq_table[, -1]
## p-value = 1
## alternative hypothesis: two.sided
# AE plot
# Dodged bars of n_events per AE category, coloured by severity, with one
# panel per treatment arm.
ae_plot <- ggplot(
  data = ae_clean,
  mapping = aes(x = ae_category, y = n_events, fill = ae_severity)
) +
  geom_col(position = "dodge") +
  facet_wrap(~ treatment_arm) +
  labs(
    title = "Adverse Events by Treatment Arm",
    x = "AE Category",
    y = "Number of Events"
  ) +
  theme_minimal()
ae_plot

# Patchwork layout for age and BMI
# Two bar charts of the arm-level means, combined into one figure with
# patchwork's `+` operator.
age_plot <- ggplot(
  data = baseline_clean,
  mapping = aes(x = treatment_arm, y = age_mean)
) +
  geom_col(fill = "skyblue") +
  labs(title = "Mean Age by Treatment Arm", y = "Mean Age")

bmi_plot <- ggplot(
  data = baseline_clean,
  mapping = aes(x = treatment_arm, y = bmi_mean)
) +
  geom_col(fill = "orange") +
  labs(title = "Mean BMI by Treatment Arm", y = "Mean BMI")

age_plot + bmi_plot

# Flextable report
# Collapse to one row per treatment arm: total sample size plus the mean of
# each reported measure, rounded to one decimal place.
arm_summary <- summarise(
  group_by(baseline_clean, treatment_arm),
  N = sum(n),
  Age_Mean = round(mean(age_mean, na.rm = TRUE), 1),
  BMI_Mean = round(mean(bmi_mean, na.rm = TRUE), 1),
  Male_Percent = round(mean(sex_male_percent, na.rm = TRUE), 1),
  Female_Percent = round(mean(sex_female_percent, na.rm = TRUE), 1)
)

# Display names for the table header, keyed by column name.
header_labels <- list(
  treatment_arm = "Treatment Arm",
  N = "Sample Size",
  Age_Mean = "Mean Age",
  BMI_Mean = "Mean BMI",
  Male_Percent = "% Male",
  Female_Percent = "% Female"
)

arm_flextable <- flextable(arm_summary)
arm_flextable <- set_header_labels(arm_flextable, values = header_labels)
autofit(arm_flextable)

| Treatment Arm | Sample Size | Mean Age | Mean BMI | % Male | % Female |
|---|---|---|---|---|---|
| Dapagliflozin 10mg | 97 | 57.6 | 30.9 | 58 | 42 |
| Dapagliflozin 5mg | 98 | 59.1 | 32.1 | 60 | 40 |
| Placebo | 100 | 58.2 | 31.5 | 55 | 45 |

# Tplyr table
# Descriptive-statistics layers for age and BMI, split by treatment arm.
# The result is stored as `baseline_tplyr` so it no longer shadows the
# Tplyr::tplyr_table() function it is built with.
# NOTE(review): each arm contributes a single row (n = 1 in the printed
# output), so the SD slot of "Mean (SD)" is blank -- confirm whether these
# layers should run on subject-level data instead.
baseline_tplyr <- tplyr_table(baseline_clean, treatment_arm) %>%
  add_layer(
    group_desc(age_mean, by = "Age (Mean ± SD)")
  ) %>%
  add_layer(
    group_desc(bmi_mean, by = "BMI (Mean ± SD)")
  ) %>%
  build()

# Print table
print(baseline_tplyr)
## # A tibble: 12 × 8
##    row_label1      row_label2 `var1_Dapagliflozin 10mg` `var1_Dapagliflozin 5mg`
##    <chr>           <chr>      <chr>                     <chr>                   
##  1 Age (Mean ± SD) n          "  1"                     "  1"                   
##  2 Age (Mean ± SD) Mean (SD)  "57.60 (      )"          "59.10 (      )"        
##  3 Age (Mean ± SD) Median     "57.60"                   "59.10"                 
##  4 Age (Mean ± SD) Q1, Q3     "57.60, 57.60"            "59.10, 59.10"          
##  5 Age (Mean ± SD) Min, Max   "57.6, 57.6"              "59.1, 59.1"            
##  6 Age (Mean ± SD) Missing    "  0"                     "  0"                   
##  7 BMI (Mean ± SD) n          "  1"                     "  1"                   
##  8 BMI (Mean ± SD) Mean (SD)  "30.90 (      )"          "32.10 (      )"        
##  9 BMI (Mean ± SD) Median     "30.90"                   "32.10"                 
## 10 BMI (Mean ± SD) Q1, Q3     "30.90, 30.90"            "32.10, 32.10"          
## 11 BMI (Mean ± SD) Min, Max   "30.9, 30.9"              "32.1, 32.1"            
## 12 BMI (Mean ± SD) Missing    "  0"                     "  0"                   
## # ℹ 4 more variables: var1_Placebo <chr>, ord_layer_index <int>,
## #   ord_layer_1 <int>, ord_layer_2 <int>

Phase 3: Interactive Shiny Dashboard

To explore the interactive dashboard for the clinical trial summary, please open and run the app.R file separately in RStudio.

The dashboard includes:

- Baseline data explorer by treatment arm
- Adverse Events bar charts
- Biomarker violin plots
- Kaplan-Meier survival curves

Summary

This interactive RMarkdown report summarizes clinical trial findings with embedded visualizations and tables. Use the tabs to explore each section.

Data Loading

# Simulated biomarker data
set.seed(123)

# Single definition of the arm labels shared by every simulated table below.
# The 10mg arm was previously misspelled "Dapagliflazin 10mg" in each table,
# which did not match the "Dapagliflozin 10mg" label used in the trial data.
arm_levels <- c("Placebo", "Dapagliflozin 5mg", "Dapagliflozin 10mg")

# One row per arm x subject (50 subjects per arm); responders are simulated
# with a higher mean biomarker level (5.5) than non-responders (4.2).
biomarker_data <- expand.grid(
  treatment_arm = arm_levels,
  subject_id = 1:50
) %>%
  mutate(
    responder_status = sample(c("Responder", "Non-Responder"), n(), replace = TRUE),
    biomarker_level = round(rnorm(n(), mean = ifelse(responder_status == "Responder", 5.5, 4.2), sd = 1), 2)
  )

# Simulated baseline data
# 150 rows with a randomly assigned arm, age and BMI. This replaces the
# baseline_clean object used earlier in the report.
baseline_clean <- data.frame(
  treatment_arm = sample(arm_levels, 150, replace = TRUE),
  age_mean = rnorm(150, mean = 60, sd = 10),
  bmi_mean = rnorm(150, mean = 27, sd = 4)
)

# Simulated AE data
# 100 rows with a random arm, category, event count (1-10) and severity.
# This replaces the ae_clean object used earlier in the report.
ae_clean <- data.frame(
  treatment_arm = sample(arm_levels, 100, replace = TRUE),
  ae_category = sample(c("Headache", "Nausea", "Fatigue"), 100, replace = TRUE),
  n_events = sample(1:10, 100, replace = TRUE),
  ae_severity = sample(c("Mild", "Moderate", "Severe"), 100, replace = TRUE)
)

# Simulated survival data
# One row per simulated baseline row, reusing its arm assignment.
surv_data <- data.frame(
  time = rexp(150, 0.1),
  status = sample(0:1, 150, replace = TRUE),
  treatment = baseline_clean$treatment_arm)

Interactive Widgets

Baseline Table

# Interactive baseline table: five rows per page with horizontal scrolling.
DT::datatable(
  data = baseline_clean,
  caption = "Interactive Table: Baseline Characteristics",
  options = list(pageLength = 5, scrollX = TRUE)
)

Adverse Events Plot

# Total the events per arm / category / severity before plotting. The
# simulated ae_clean draws 100 rows from a small set of combinations, so the
# same combination occurs on several rows; dodged geom_col() bars built from
# separate rows are drawn on top of each other instead of adding up.
ae_totals <- ae_clean %>%
  dplyr::count(
    treatment_arm, ae_category, ae_severity,
    wt = n_events,
    name = "n_events"
  )

# Dodged bars of the event totals, one panel per treatment arm.
ae_plot_static <- ggplot(ae_totals, aes(x = ae_category, y = n_events, fill = ae_severity)) +
  geom_col(position = "dodge") +
  facet_wrap(~ treatment_arm) +
  labs(title = "Adverse Events by Treatment Arm", x = "Adverse Event", y = "Event Count") +
  theme_minimal()

# Convert the static ggplot into an interactive plotly widget.
plotly::ggplotly(ae_plot_static)

Biomarker Violin Plot

# Untrimmed violins of biomarker level by responder status, each overlaid with
# a narrow white box plot, in one panel per treatment arm.
biomarker_violin <- ggplot(
  data = biomarker_data,
  mapping = aes(
    x = responder_status,
    y = biomarker_level,
    fill = responder_status
  )
) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1, fill = "white") +
  facet_wrap(~ treatment_arm) +
  labs(
    title = "Biomarker Levels by Responder Status",
    x = "Response",
    y = "Biomarker Level"
  ) +
  theme_minimal()

# Convert the static ggplot into an interactive plotly widget.
plotly::ggplotly(biomarker_violin)

Kaplan-Meier Survival Plot

# Fit one Kaplan-Meier curve per simulated treatment group.
km_fit <- survfit(
  Surv(time, status) ~ treatment,
  data = surv_data
)

# Draw the curves with a p-value and a risk table whose row labels are
# coloured by group.
survminer::ggsurvplot(
  fit = km_fit,
  data = surv_data,
  pval = TRUE,
  risk.table = TRUE,
  risk.table.y.text.col = TRUE,
  ggtheme = theme_minimal()
)


Note: Shiny dashboards require a separate runtime environment and are not rendered directly in this report.

Conclusion

This RMarkdown report simulates a clinical trial summary workflow. It contains high-quality tables, survival plots, and interactive widgets.

End of report